#load data
my_data <- read.delim("proteinGroups.txt")
data <-my_data
experimental_design <- read.delim("20231210_experiment_OE.txt")
# We filter for contaminant proteins and decoy database hits, which are indicated by "+" in the columns "Potential.contaminants" and "Reverse", respectively.
data <- filter(data, Reverse != "+", Potential.contaminant != "+")
#Are there any duplicated gene names?
data$Gene.names %>% duplicated() %>% any()
## [1] TRUE
#Make a table of duplicated gene names
data %>% group_by(Gene.names) %>% summarize(frequency = n()) %>%
arrange(desc(frequency)) %>% filter(frequency > 1)
## # A tibble: 1 × 2
## Gene.names frequency
## <chr> <int>
## 1 "" 13
#For further analysis these proteins must get unique names. Additionally,
#some proteins do not have an annotated gene name and for those we will use the Uniprot ID.
#Make unique names using the annotation in the "Gene.names" column as primary names and the annotation in "Protein.IDs" as name for those that do not have an gene name.
data_unique <- make_unique(data, "Gene.names", "Protein.IDs", delim = ";")
#Are there any duplicated names?
data$name %>% duplicated() %>% any()
## [1] FALSE
#Generate a SummarizedExperiment object using an experimental design
LFQ_columns <- grep("LFQ.", colnames(data_unique)) # get LFQ column numbers
data_se <- make_se(data_unique, LFQ_columns, experimental_design)
#Generate a SummarizedExperiment object by parsing condition information from the column names
LFQ_columns <- grep("LFQ.", colnames(data_unique)) # get LFQ column numbers
data_se_parsed <- make_se_parse(data_unique, LFQ_columns)
#Let's have a look at the SummarizedExperiment object
data_se
## class: SummarizedExperiment
## dim: 425 48
## metadata(0):
## assays(1): ''
## rownames(425): FBLL1 DS1_GFP ... CFAP20 IGF2BP2
## rowData names(498): Protein.IDs Majority.protein.IDs ... name ID
## colnames(48): parental_1 parental_2 ... cand19_5 cand19_6
## colData names(4): label ID condition replicate
# Plot a barplot of the protein identification overlap between samples
plot_frequency(data_se)
#filter_missval filters a proteomics dataset based on missing values.
#The dataset is filtered for proteins that have a maximum of ’thr’ missing values in at least one condition.
#IF filter for proteins that are identified in all replicates of at least one condition:
#data_filt <- filter_missval(data_se, thr = 0)
# OR FOR Less stringent filtering:
#Filter for proteins that are identified in 3 out of 6 replicates of at least one condition
data_filt <- filter_missval(data_se, thr = 3)
#After filtering, the number of identified proteins per sample can be plotted as well as the overlap in identifications between samples.
# Plot a barplot of the number of identified proteins per samples
plot_numbers(data_filt)
# Plot a barplot of the protein identification overlap between samples
plot_coverage(data_filt)
#The data is background corrected and normalized by variance stabilizing transformation (vsn).
# Normalize the data
data_norm <- normalize_vsn(data_filt)
#The normalization can be inspected by checking the distributions of the samples before and after normalization.
# Visualize normalization by boxplots for all samples before and after normalization
plot_normalization(data_filt, data_norm)
#-> looks worse with normalization than without I think, so don't normalize, however GFP linker replicates seem badly aligned in data_fil
#Plot a heatmap of proteins with missing values
plot_missval(data_filt)
# Plot intensity distributions and cumulative fraction of proteins with and without missing values
plot_detect(data_filt)
data_imp <- impute(data_filt, fun = "min") #without norm, if want norm, need to impute "data_norm"
# if want a different method, all possible imputation methods are printed in an error, if an invalid function name is given.
#impute(data_filt, fun = "")
#more explanation: https://rdrr.io/bioc/MSnbase/man/impute-methods.html
plot_imputation(data_filt, data_imp) #without norm, if want norm, need to impute "data_norm"
## Differential enrichment analysis based on linear models and
empherical Bayes statistics ### parental vs rest LFC = 1.5
# Test every sample versus control
data_diff <- test_diff(data_imp, type = "control", control = "parental")
## Tested contrasts: GFP_link_vs_parental, cand1_vs_parental, cand2_vs_parental, cand11_vs_parental, cand12_vs_parental, cand16_vs_parental, cand19_vs_parental
##Tested contrasts: GFP_link_vs_parental, cand1_vs_parental, cand2_vs_parental, cand11_vs_parental, cand12_vs_parental, cand16_vs_parental, cand19_vs_parental
#otherwise also possible to test all possible comparisons of samples:
#data_diff_all_contrasts <- test_diff(data_imp, type = "all")
#or to test manually defined comparisons
#data_diff_manual <- test_diff(data_imp, type = "manual",
# test = c("RPL11_vs_cand1", "cand1_vs_RPL11"))
##Finally, significant proteins are defined by user-defined cutoffs using add_rejections.
# Denote significant proteins based on user defined cutoffs, vs control
dep <- add_rejections(data_diff, alpha = 0.05, lfc = log2(1.5))
#OR
# Denote significant proteins based on user defined cutoffs, vs all
#dep <- add_rejections(data_diff_all_contrasts, alpha = 0.05, lfc = log2(1.5))
# Plot the first and second principal components
plot_pca(dep, x = 1, y = 2, n = 54, point_size = 4)
## Warning: Use of `pca_df[[indicate[1]]]` is discouraged.
## ℹ Use `.data[[indicate[1]]]` instead.
## Warning: Use of `pca_df[[indicate[2]]]` is discouraged.
## ℹ Use `.data[[indicate[2]]]` instead.
# Plot the Pearson correlation matrix
plot_cor(dep, significant = TRUE, lower = 0, upper = 1, pal = "Reds")
# Plot a heatmap of all significant proteins with the data centered per protein
plot_heatmap(dep, type = "centered", kmeans = TRUE,
k = 3, col_limit = 4, show_row_names = FALSE,
indicate = c("condition", "replicate"))
# Plot a heatmap of all significant proteins (rows) and the tested contrasts (columns)
plot_heatmap(dep, type = "contrast", kmeans = TRUE,
k = 3, col_limit = 10, show_row_names = FALSE)
# Plot a volcano plot for the contrast " vs Ctrl""
#pdf('20231210_allvsparental_filt3out6reps_NOnorm_imp_min_lfc_1_5.pdf')
plot_volcano(dep, contrast = "GFP_link_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep, contrast = "cand1_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep, contrast = "cand2_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep, contrast = "cand11_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep, contrast = "cand12_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep, contrast = "cand16_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep, contrast = "cand19_vs_parental", label_size = 2, add_names = TRUE)
#dev.off()
# Generate a results table
data_results <- get_results(dep)
#save the results table
#write.table(data_results, "20231210_allvsparental_filt3out6reps_NOnorm_imp_min_lfc_1_5.txt", sep="\t")
# Denote significant proteins based on user defined cutoffs, vs control
dep_2 <- add_rejections(data_diff, alpha = 0.05, lfc = log2(1))
#OR
# Denote significant proteins based on user defined cutoffs, vs all
#dep <- add_rejections(data_diff_all_contrasts, alpha = 0.05, lfc = log2(1.5))
# Plot the first and second principal components
plot_pca(dep_2, x = 1, y = 2, n = 54, point_size = 4)
## Warning: Use of `pca_df[[indicate[1]]]` is discouraged.
## ℹ Use `.data[[indicate[1]]]` instead.
## Warning: Use of `pca_df[[indicate[2]]]` is discouraged.
## ℹ Use `.data[[indicate[2]]]` instead.
# Plot the Pearson correlation matrix
plot_cor(dep_2, significant = TRUE, lower = 0, upper = 1, pal = "Reds")
# Plot a heatmap of all significant proteins with the data centered per protein
plot_heatmap(dep_2, type = "centered", kmeans = TRUE,
k = 3, col_limit = 4, show_row_names = FALSE,
indicate = c("condition", "replicate"))
# Plot a heatmap of all significant proteins (rows) and the tested contrasts (columns)
plot_heatmap(dep_2, type = "contrast", kmeans = TRUE,
k = 3, col_limit = 10, show_row_names = FALSE)
# Plot a volcano plot for the contrast " vs Ctrl""
#pdf('20231027_allvsparental_filt3out6reps_NOnorm_imp_min_lfc_1.pdf')
plot_volcano(dep_2, contrast = "GFP_link_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_2, contrast = "cand1_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_2, contrast = "cand2_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 42 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_2, contrast = "cand11_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_2, contrast = "cand12_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_2, contrast = "cand16_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_2, contrast = "cand19_vs_parental", label_size = 2, add_names = TRUE)
#dev.off()
# Generate a results table
data_results_2 <- get_results(dep_2)
#save the results table
#write.table(data_results_2, "20231210_allvsparental_filt3out6reps_NOnorm_imp_min_lfc_1.txt", sep="\t")
#Plot a heatmap of proteins with missing values
plot_missval(data_norm)
# Plot intensity distributions and cumulative fraction of proteins with and without missing values
plot_detect(data_norm)
data_imp_norm <- impute(data_norm, fun = "min")
# if want a different method, all possible imputation methods are printed in an error, if an invalid function name is given.
#impute(data_filt, fun = "")
#more explanation: https://rdrr.io/bioc/MSnbase/man/impute-methods.html
plot_imputation(data_norm, data_imp_norm)
## Differential enrichment analysis based on linear models and
empherical Bayes statistics ### parental vs rest LFC = 1.5
# Test every sample versus control
data_diff_norm <- test_diff(data_imp_norm, type = "control", control = "parental")
## Tested contrasts: GFP_link_vs_parental, cand1_vs_parental, cand2_vs_parental, cand11_vs_parental, cand12_vs_parental, cand16_vs_parental, cand19_vs_parental
##Tested contrasts: GFP_link_vs_parental, cand1_vs_parental, cand2_vs_parental, cand11_vs_parental, cand12_vs_parental, cand16_vs_parental, cand19_vs_parental
#otherwise also possible to test all possible comparisons of samples:
#data_diff_all_contrasts <- test_diff(data_imp, type = "all")
#or to test manually defined comparisons
#data_diff_manual <- test_diff(data_imp, type = "manual",
# test = c("RPL11_vs_cand1", "cand1_vs_RPL11"))
##Finally, significant proteins are defined by user-defined cutoffs using add_rejections.
# Denote significant proteins based on user defined cutoffs, vs control
dep_norm <- add_rejections(data_diff_norm, alpha = 0.05, lfc = log2(1.5))
#OR
# Denote significant proteins based on user defined cutoffs, vs all
#dep <- add_rejections(data_diff_all_contrasts, alpha = 0.05, lfc = log2(1.5))
# Plot the first and second principal components
plot_pca(dep_norm, x = 1, y = 2, n = 54, point_size = 4)
## Warning: Use of `pca_df[[indicate[1]]]` is discouraged.
## ℹ Use `.data[[indicate[1]]]` instead.
## Warning: Use of `pca_df[[indicate[2]]]` is discouraged.
## ℹ Use `.data[[indicate[2]]]` instead.
# Plot the Pearson correlation matrix
plot_cor(dep_norm, significant = TRUE, lower = 0, upper = 1, pal = "Reds")
# Plot a heatmap of all significant proteins with the data centered per protein
plot_heatmap(dep_norm, type = "centered", kmeans = TRUE,
k = 3, col_limit = 4, show_row_names = FALSE,
indicate = c("condition", "replicate"))
# Plot a heatmap of all significant proteins (rows) and the tested contrasts (columns)
plot_heatmap(dep_norm, type = "contrast", kmeans = TRUE,
k = 3, col_limit = 10, show_row_names = FALSE)
# Plot a volcano plot for the contrast " vs Ctrl""
#pdf('20231210_allvsparental_filt3out6reps_norm_imp_min_lfc_1_5.pdf')
plot_volcano(dep_norm, contrast = "GFP_link_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_norm, contrast = "cand1_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_norm, contrast = "cand2_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_norm, contrast = "cand11_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_norm, contrast = "cand12_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_norm, contrast = "cand16_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_norm, contrast = "cand19_vs_parental", label_size = 2, add_names = TRUE)
#dev.off()
# Generate a results table
data_results_norm <- get_results(dep_norm)
#save the results table
write.table(data_results_norm, "20231210_allvsparental_filt3out6reps_norm_imp_min_lfc_1_5.txt", sep="\t")
# Denote significant proteins based on user defined cutoffs, vs control
dep_norm_2 <- add_rejections(data_diff_norm, alpha = 0.05, lfc = log2(1))
#OR
# Denote significant proteins based on user defined cutoffs, vs all
#dep <- add_rejections(data_diff_all_contrasts, alpha = 0.05, lfc = log2(1.5))
# Plot the first and second principal components
plot_pca(dep_norm_2, x = 1, y = 2, n = 54, point_size = 4)
## Warning: Use of `pca_df[[indicate[1]]]` is discouraged.
## ℹ Use `.data[[indicate[1]]]` instead.
## Warning: Use of `pca_df[[indicate[2]]]` is discouraged.
## ℹ Use `.data[[indicate[2]]]` instead.
# Plot the Pearson correlation matrix
plot_cor(dep_norm_2, significant = TRUE, lower = 0, upper = 1, pal = "Reds")
# Plot a heatmap of all significant proteins with the data centered per protein
plot_heatmap(dep_norm_2, type = "centered", kmeans = TRUE,
k = 3, col_limit = 4, show_row_names = FALSE,
indicate = c("condition", "replicate"))
# Plot a heatmap of all significant proteins (rows) and the tested contrasts (columns)
plot_heatmap(dep_norm_2, type = "contrast", kmeans = TRUE,
k = 3, col_limit = 10, show_row_names = FALSE)
# Plot a volcano plot for the contrast " vs Ctrl""
#pdf('20231210_allvsparental_filt3out6reps_norm_imp_min_lfc_1.pdf')
plot_volcano(dep_norm_2, contrast = "GFP_link_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_norm_2, contrast = "cand1_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_norm_2, contrast = "cand2_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_norm_2, contrast = "cand11_vs_parental", label_size = 2, add_names = TRUE)
plot_volcano(dep_norm_2, contrast = "cand12_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_norm_2, contrast = "cand16_vs_parental", label_size = 2, add_names = TRUE)
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot_volcano(dep_norm_2, contrast = "cand19_vs_parental", label_size = 2, add_names = TRUE)
#dev.off()
# Generate a results table
data_results_norm_2 <- get_results(dep_norm_2)
#save the results table
#write.table(data_results_norm_2 , "20231210_allvsparental_filt3out6reps_norm_imp_min_lfc_1.txt", sep="\t")